Import Dataset

# Load the German credit dataset; first row holds column names.
# TRUE spelled out rather than T (T is reassignable and unsafe).
german <- read.csv("germancredit.csv", header = TRUE)
head(german)[1:7]
##   Default checkingstatus1 duration history purpose amount savings
## 1       0             A11        6     A34     A43   1169     A65
## 2       1             A12       48     A32     A43   5951     A61
## 3       0             A14       12     A34     A46   2096     A61
## 4       0             A11       42     A32     A42   7882     A61
## 5       1             A11       24     A33     A40   4870     A61
## 6       0             A14       36     A32     A46   9055     A65

One-Hot Encoding

Below I am making dummy variables (or one-hot encodings) of the categorical variables. These will be needed in the many situations where qualitative data isn’t allowed — for instance, when performing PCA or SMOTE. In Gareth James et al.’s book “An Introduction To Statistical Learning” (ISLR), the authors discuss one-hot encoding in Chapter 6, specifically in the context of linear regression models. In Section 6.2.2, the authors explain that when one-hot encoding a categorical variable with k levels, one should create k-1 binary variables. The reason for this is to avoid perfect multicollinearity in the model matrix, which can cause problems when fitting linear regression models.

# One-hot encode every column via caret's dummyVars. All k levels of
# each categorical variable are kept here; first levels are dropped in
# a later step.
one.hot.encoder <- dummyVars(" ~ .", data = german)
dummy.german <- data.frame(predict(one.hot.encoder, newdata = german))
head(dummy.german)[1:4]
##   Default checkingstatus1A11 checkingstatus1A12 checkingstatus1A13
## 1       0                  1                  0                  0
## 2       1                  0                  1                  0
## 3       0                  0                  0                  0
## 4       0                  1                  0                  0
## 5       1                  1                  0                  0
## 6       0                  0                  0                  0

Below is one-hot-encoding while removing the first column.

# Drop the first dummy column of each categorical variable
# ('df' = 'drop first') to avoid perfect multicollinearity.
first.level.cols <- c(
  "checkingstatus1A11", "historyA30", "purposeA40",
  "savingsA61", "employA71", "statusA91", "othersA101",
  "propertyA121", "otherplansA141", "housingA151",
  "jobA171", "teleA191", "foreignA201"
)
dummy.german.df <- dummy.german[, !(names(dummy.german) %in% first.level.cols)]

head(dummy.german.df)[1:4]
##   Default checkingstatus1A12 checkingstatus1A13 checkingstatus1A14
## 1       0                  0                  0                  0
## 2       1                  1                  0                  0
## 3       0                  0                  0                  1
## 4       0                  0                  0                  0
## 5       1                  0                  0                  0
## 6       0                  0                  0                  1

Examine Dimensions

# Count rows in each outcome class: Default == 0 (repaid, "good")
# vs Default == 1 (defaulted). sum() on a logical vector is the
# idiomatic row count; dim(subset)[1] built a whole subset just to
# measure it.
default.0 <- sum(german$Default == 0)
default.1 <- sum(german$Default == 1)
paste("The number of those that did not default are", default.0, "and those that did default are", default.1)
## [1] "The number of those that did not default are 700 and those that did default are 300"

Visualizing Dataset Class Imbalance

# Bar chart of the number of observations per Default class, to make
# class imbalance visible.
#   data - data frame containing a `Default` column
Balance.Plot <- function(data) {
  class_counts <- data.frame(table(data$Default))
  # Build labels from the actual class values instead of hard-coding
  # exactly two labels; yields "Default 0" / "Default 1" for this data.
  # (as_tibble added nothing for plotting and has been dropped.)
  class_counts$class <- paste("Default", class_counts$Var1)
  ggplot(class_counts, aes(x = class, y = Freq, fill = as.factor(class))) +
    geom_bar(stat = "identity") +
    labs(x = "Class", y = "Count", title = "Class Counts") +
    scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
    theme_minimal()
}
Balance.Plot(german)

Balance Classes

# Balance the classes by synthetic resampling with ROSE.
# NOTE(review): the original comment said "SMOTE", but ROSE() is what
# is actually called — they are different resampling methods.
balanced.dummy.german <- ROSE(Default ~ ., data = dummy.german, seed = 123)$data
Balance.Plot(balanced.dummy.german)

Value Counts For Each Class Type

# Dodged bar chart of level counts for one categorical column, split
# by the binary Default class.
#   data        - data frame containing `Default` plus `column.name`
#   column.name - name (string) of the categorical column to tabulate
Count.Plot <- function(data, column.name) {
  # Tabulate the column's levels separately within each class
  counts.df.0 <- as.data.frame(table(data[data$Default == 0, ][column.name]))
  counts.df.1 <- as.data.frame(table(data[data$Default == 1, ][column.name]))

  colnames(counts.df.0) <- c(column.name, "count")
  colnames(counts.df.1) <- c(column.name, "count")

  # Tag each table with its class (base assignment; the dplyr::mutate
  # dependency was unnecessary), then stack them for plotting
  counts.df.0$class <- 0
  counts.df.1$class <- 1
  combined_df <- rbind(counts.df.0, counts.df.1)

  # .data[[column.name]] is the supported way to reference a column by
  # a string inside aes(); aes(x = combined_df[, column.name]) bypasses
  # ggplot's data masking and breaks if layers get different data
  ggplot(combined_df, aes(x = .data[[column.name]], y = count, fill = as.factor(class))) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(x = column.name, y = "Count", fill = "Class") +
    scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
    theme_minimal()
}

Plotting Standard Dataset Value Counts

# Count plots for each of the main categorical predictors.
# print() is required inside a loop to reproduce the top-level
# autoprinting of each ggplot object.
for (cat.col in c("checkingstatus1", "history", "purpose", "savings", "employ")) {
  print(Count.Plot(german, cat.col))
}

Plotting Numerical Variable Distributions

# Overlay density curves of a numeric column for the two Default
# classes on one plot.
#   data        - data frame with a binary `Default` column
#   column.name - name (string) of the numeric column to plot
Density.Plot <- function(data, column.name) {

  # Split the data frame by class
  df_default0 <- data[data[["Default"]] == 0, ]
  df_default1 <- data[data[["Default"]] == 1, ]

  # .data[[column.name]] keeps each aesthetic tied to its own layer's
  # data; aes(x = df_default0[, column.name]) bypassed data masking
  ggplot() +
    geom_density(data = df_default0, aes(x = .data[[column.name]], fill = "Default 0"), alpha = 0.5) +
    geom_density(data = df_default1, aes(x = .data[[column.name]], fill = "Default 1"), alpha = 0.5) +
    labs(title = paste("Distribution of", column.name, "by Default"),
         x = column.name,
         y = "Density") +
    scale_fill_manual(values = c("#F8766D", "#00BFC4"), name = "Default") +
    theme_minimal()
}
# Density plots for each numeric predictor. print() reproduces the
# top-level autoprinting of each ggplot object.
for (num.col in c("duration", "amount", "installment", "residence",
                  "age", "cards", "liable")) {
  print(Density.Plot(german, num.col))
}

RFE

# Recursive feature elimination with a random-forest backend and
# 10-fold cross-validation. Prints the rfe summary, plots CV accuracy
# versus subset size and the top variable importances, and returns
# `data` reduced to Default plus the top `num.features` features.
#   data         - data frame with a `Default` column
#   num.features - number of top features to keep (default 4)
RFE <- function(data, num.features=4) {
  # Define the predictor and response variables
  train.X <- data[, !(names(data) %in% c("Default"))]
  train.Y <- as.factor(data[, "Default"])

  # Random-forest ranking functions, 10-fold CV
  ctrl <- rfeControl(functions = rfFuncs,
                     method = "cv",
                     number = 10)

  # Evaluate candidate subset sizes 1..num.features
  # (seq_len is safe even for num.features == 0, unlike 1:n)
  rf_rfe <- rfe(train.X, train.Y, sizes = seq_len(num.features), rfeControl = ctrl)

  # Print the resampling summary
  print(rf_rfe)

  # Plot CV accuracy against subset size
  p <- plot(rf_rfe, type = c("g", "o"))
  print(p)

  # varImp() rows come back ordered by importance, so the first
  # num.features row names are the top features
  features <- row.names(varImp(rf_rfe))[seq_len(num.features)]

  varimp_data <- data.frame(feature = features,
                          importance = varImp(rf_rfe)[seq_len(num.features), 1])

  # Bar chart of the top variable importances
  gg <- ggplot(data = varimp_data, 
         aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
    geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") + 
    geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) + 
    theme_bw() + theme(legend.position = "none") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
  
  print(gg)

  # Keep the label column alongside the selected features
  target.features <- c("Default", features)
  
  return(subset(data, select=target.features))
}

RFE with Various Features

Running RFE on German Data with 4 features

# Default + top 4 RFE-selected features from the raw data
RFE.4.German <- RFE(german)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy   Kappa AccuracySD KappaSD Selected
##          1    0.689 0.03178    0.02424 0.06864         
##          2    0.710 0.21972    0.03300 0.07790         
##          3    0.742 0.30027    0.03967 0.09527         
##          4    0.737 0.32986    0.04270 0.10282         
##         20    0.778 0.40747    0.03425 0.09421        *
## 
## The top 5 variables (out of 20):
##    checkingstatus1, duration, history, amount, savings

Running RFE on German Data with 10 features

# Default + top 10 RFE-selected features from the raw data
RFE.10.German <- RFE(german, num.features=10)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy   Kappa AccuracySD KappaSD Selected
##          1    0.691 0.03012    0.02025 0.06453         
##          2    0.698 0.19492    0.04566 0.11076         
##          3    0.733 0.27972    0.03743 0.10687         
##          4    0.712 0.25745    0.05692 0.13524         
##          5    0.733 0.30849    0.04423 0.09798         
##          6    0.762 0.38809    0.05412 0.13560         
##          7    0.754 0.36673    0.05641 0.14819         
##          8    0.766 0.39372    0.03502 0.08374         
##          9    0.762 0.38769    0.03853 0.09649         
##         10    0.765 0.38995    0.03979 0.10783         
##         20    0.768 0.38077    0.03553 0.08872        *
## 
## The top 5 variables (out of 20):
##    checkingstatus1, duration, history, amount, savings

Running RFE on Dummy-Variabled German Data

# RFE over the one-hot-encoded data, top 4 features
RFE.4.Dummy.German <- RFE(dummy.german)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy   Kappa AccuracySD KappaSD Selected
##          1    0.700 0.00000    0.00000 0.00000         
##          2    0.689 0.01123    0.03510 0.04459         
##          3    0.710 0.10825    0.03197 0.08230         
##          4    0.720 0.24240    0.03682 0.09612         
##         61    0.762 0.34407    0.03615 0.10294        *
## 
## The top 5 variables (out of 61):
##    checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount

Running RFE on Dummy-Variabled German Data with 2 Features

# Same encoded data, top 2 features only
RFE.2.Dummy.German <- RFE(dummy.german, num.features=2)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy    Kappa AccuracySD  KappaSD Selected
##          1    0.700 0.000000   0.000000 0.000000         
##          2    0.698 0.001235   0.006325 0.003904         
##         61    0.762 0.353586   0.031903 0.090578        *
## 
## The top 5 variables (out of 61):
##    checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount

Running RFE on Dummy-Variabled German Data with 10 Variables

# Encoded data, top 10 features
RFE.10.Dummy.German <- RFE(dummy.german, num.features=10)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy   Kappa AccuracySD KappaSD Selected
##          1    0.700 0.00000    0.00000 0.00000         
##          2    0.704 0.02697    0.01265 0.08527         
##          3    0.719 0.13360    0.01792 0.07603         
##          4    0.720 0.23540    0.03464 0.12233         
##          5    0.717 0.21156    0.02869 0.10991         
##          6    0.729 0.25947    0.02514 0.09212         
##          7    0.742 0.29115    0.03084 0.11422         
##          8    0.743 0.30061    0.02869 0.09993         
##          9    0.731 0.29040    0.02514 0.08336         
##         10    0.738 0.30251    0.02251 0.06384         
##         61    0.767 0.37131    0.04322 0.12010        *
## 
## The top 5 variables (out of 61):
##    checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount

# RFE on the ROSE-balanced encoded data; reused later by NB.Model()
RFE.10.Balanced.Dummy.German <- RFE(balanced.dummy.german, num.features = 10)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.5781 0.1562    0.06540 0.13045         
##          2   0.6580 0.3159    0.03272 0.06569         
##          3   0.7149 0.4300    0.04979 0.09990         
##          4   0.7670 0.5342    0.03854 0.07732         
##          5   0.7809 0.5618    0.03886 0.07782         
##          6   0.8030 0.6058    0.05592 0.11185         
##          7   0.8130 0.6258    0.04714 0.09424         
##          8   0.8270 0.6538    0.04705 0.09406         
##          9   0.8299 0.6596    0.04562 0.09118         
##         10   0.8249 0.6496    0.04901 0.09807         
##         61   0.8510 0.7021    0.04429 0.08862        *
## 
## The top 5 variables (out of 61):
##    checkingstatus1A14, foreignA201, historyA30, purposeA48, foreignA202

Running PCA Dimensionality Reduction

Running PCA On the Imbalanced Dataset

# Standardize, then run PCA on the one-hot-encoded (imbalanced) data.
# NOTE(review): the data are scaled twice (scale() here and
# scale. = TRUE in prcomp) — numerically redundant but harmless. Also,
# the Default column itself is among the PCA inputs, which leaks the
# label into the component scores — confirm that is intended.
dummy.german_std <- scale(dummy.german)

german.pca <- prcomp(dummy.german_std, center = TRUE, scale. = TRUE)
# Standard deviation of each principal component.
# Renamed from `sd`, which shadowed base::sd; also fixes the
# "Principle" -> "Principal" typo in the axis label.
pc.sd <- summary(german.pca)$sdev

plot(pc.sd, type = "b", xlab = "Principal Component", ylab = "Standard Deviation")

# Proportion of variance explained per component (scree plot)
pov <- german.pca$sdev^2/sum(german.pca$sdev^2)
plot(pov, type = "b", xlab = "Principal Component", ylab = "Proportion of Variance")

Running PCA On the Balanced Dataset

# Standardize, then run PCA on the ROSE-balanced encoded data.
# NOTE(review): this REASSIGNS dummy.german_std, so every later use of
# that name (the NB/KNN PCA calls below) receives the balanced data,
# while german.pca was fit on the imbalanced data — confirm this
# pairing is intentional.
dummy.german_std <- scale(balanced.dummy.german)

german.balanced.pca <- prcomp(dummy.german_std, center = TRUE, scale. = TRUE)
# Standard deviation of each principal component ("Principal" typo in
# the axis label fixed; `sd` renamed to avoid shadowing base::sd)
pc.sd.balanced <- summary(german.balanced.pca)$sdev

plot(pc.sd.balanced, type = "b", xlab = "Principal Component", ylab = "Standard Deviation")

# Proportion of variance explained per component
pov <- german.balanced.pca$sdev^2/sum(german.balanced.pca$sdev^2)
plot(pov, type = "b", xlab = "Principal Component", ylab = "Proportion of Variance")

Passing PCA Results in NB

# Train and evaluate a naive Bayes classifier on PCA scores.
#   pca  - a prcomp object (possibly truncated via Get.First.n.PCs)
#   data - matrix/data frame containing a "Default" column; rows are
#          projected onto the PCs, "Default" supplies the labels
# Prints the ROC curve, confusion matrix and summary metrics; returns
# the false-positive percentage.
NB.Model.PCA <- function(pca, data) {
  # Project the data onto the principal components
  pc_scores <- predict(pca, data)

  # 70/30 train/test split on shared row indices
  set.seed(123) # for reproducibility
  train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
  train_data <- pc_scores[train_index, ]
  # NOTE(review): if `data` is a scaled matrix, these labels are the
  # *scaled* Default values — that is why the confusion-matrix class
  # names print as numbers like -0.9856. Consider passing raw labels.
  train_label <- as.factor(data[train_index,"Default"])
  test_data <- pc_scores[-train_index, ]
  test_label <- as.factor(data[-train_index,"Default"])

  # Train the naive Bayes classifier using the training data
  nb_model <- naive_bayes(train_data, train_label)

  # Score the held-out set
  nb_pred <- predict(nb_model, newdata=test_data)
  nbConf <- confusionMatrix(nb_pred, test_label)

  # ROC/AUC via ROCR (labels coerced to numeric scores); local name
  # fixed from the original typo "nbPredictiction"
  nbPrediction <- prediction(as.numeric(nb_pred), as.numeric(test_label))
  nbPerf <- performance(nbPrediction, measure = "tpr", x.measure = "fpr")
  nbAUC <- performance(nbPrediction, measure = "auc")

  # plot() draws as a side effect and returns NULL; the original
  # print(plot(...)) is what emitted the stray "NULL" lines in the report
  plot(nbPerf)

  # Extract performance metrics (tpr/fpr vectors over ROCR's cutoffs)
  sensitivity <- slot(nbPerf, "y.values")[[1]]
  specificity <- 1 - slot(nbPerf, "x.values")[[1]]
  auc <- slot(nbAUC, "y.values")
  nbError <- mean(as.numeric(nb_pred) !=as.numeric(test_label))

  # Print performance metrics
  print(nbConf)
  print(paste0("Sensitivity: ", sensitivity))
  print(paste0("Specificity: ", specificity))
  print(paste0("AUC: ", auc))
  print(paste0("Error rate:", nbError))

  # False positives: predicted factor level 2 while the truth is level 1
  false_positives <- sum(as.numeric(nb_pred) == 2 & as.numeric(test_label) == 1)

  # Express as a percentage of the true level-1 cases
  total_negatives <- sum(as.numeric(test_label) == 1)
  false_positives_percent <- false_positives / total_negatives * 100

  # Print the false positives percentage
  print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))

  return(false_positives_percent)
}
# Truncate a prcomp object to its first n principal components.
#   pca.data - a prcomp fit
#   n        - number of leading components to keep (default 4)
# Returns a copy of pca.data whose $x (scores) and $rotation
# (loadings) hold only the first n columns, so predict() produces
# n-dimensional scores.
Get.First.n.PCs <- function(pca.data, n=4) {
  truncated <- pca.data
  # drop = FALSE keeps the matrix shape even when n == 1; the original
  # pca.data$x[, 1:n] collapsed to a plain vector in that case
  truncated$x <- pca.data$x[, seq_len(n), drop = FALSE]
  truncated$rotation <- pca.data$rotation[, seq_len(n), drop = FALSE]

  return(truncated)
}
# Train and evaluate a naive Bayes classifier on raw (non-PCA)
# features.
#   data - data frame/matrix with a "Default" label column
# Prints the ROC curve, confusion matrix and summary metrics; returns
# the false-positive percentage.
NB.Model <- function(data) {

  # 70/30 train/test split
  set.seed(123) # for reproducibility
  train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
  # BUG FIX: the original passed the whole row — Default included — as
  # predictors, leaking the label into the classifier (which is why the
  # reported accuracy was a near-perfect 0.9967). Predictors now
  # exclude the Default column.
  feature_cols <- setdiff(colnames(data), "Default")
  train_data <- data[train_index, feature_cols]
  train_label <- as.factor(data[train_index,"Default"])
  test_data <- data[-train_index, feature_cols]
  test_label <- as.factor(data[-train_index,"Default"])

  # Train the naive Bayes classifier using the training data
  nb_model <- naive_bayes(train_data, train_label)

  # Score the held-out set
  nb_pred <- predict(nb_model, newdata=test_data)
  nbConf <- confusionMatrix(nb_pred, test_label)

  # ROC/AUC via ROCR; local name fixed from typo "nbPredictiction"
  nbPrediction <- prediction(as.numeric(nb_pred), as.numeric(test_label))
  nbPerf <- performance(nbPrediction, measure = "tpr", x.measure = "fpr")
  nbAUC <- performance(nbPrediction, measure = "auc")

  # plot() returns NULL; the original print(plot(...)) emitted a stray
  # "NULL" line in the report
  plot(nbPerf)

  # Extract performance metrics (tpr/fpr vectors over ROCR's cutoffs)
  sensitivity <- slot(nbPerf, "y.values")[[1]]
  specificity <- 1 - slot(nbPerf, "x.values")[[1]]
  auc <- slot(nbAUC, "y.values")
  nbError <- mean(as.numeric(nb_pred) !=as.numeric(test_label))

  # Print performance metrics
  print(nbConf)
  print(paste0("Sensitivity: ", sensitivity))
  print(paste0("Specificity: ", specificity))
  print(paste0("AUC: ", auc))
  print(paste0("Error rate:", nbError))

  # False positives: predicted factor level 2 while the truth is level 1
  false_positives <- sum(as.numeric(nb_pred) == 2 & as.numeric(test_label) == 1)

  # Express as a percentage of the true level-1 cases
  total_negatives <- sum(as.numeric(test_label) == 1)
  false_positives_percent <- false_positives / total_negatives * 100

  # Print the false positives percentage
  print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))

  return(false_positives_percent)
}

NB w/ PCA On All Components

# NB on all PCs. NOTE(review): dummy.german_std was reassigned to the
# scaled *balanced* data above, while german.pca was fit on the
# imbalanced data — verify this pairing is intended.
NB.Model.PCA(german.pca, dummy.german_std)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                145                5
##   1.01359221009092                    4              146
##                                             
##                Accuracy : 0.97              
##                  95% CI : (0.9438, 0.9862)  
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.94              
##                                             
##  Mcnemar's Test P-Value : 1                 
##                                             
##             Sensitivity : 0.9732            
##             Specificity : 0.9669            
##          Pos Pred Value : 0.9667            
##          Neg Pred Value : 0.9733            
##              Prevalence : 0.4967            
##          Detection Rate : 0.4833            
##    Detection Prevalence : 0.5000            
##       Balanced Accuracy : 0.9700            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.966887417218543"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.973154362416107"
## [3] "Specificity: 0"                
## [1] "AUC: 0.970020889817325"
## [1] "Error rate:0.03"
## [1] "False positives percentage: 2.685%"
## [1] 2.684564

NB w/ PCA On First Four Components

# NB on just the first 4 principal components
german.pca.first.four <- Get.First.n.PCs(german.pca, 4)
NB.Model.PCA(german.pca.first.four, dummy.german_std)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                111               39
##   1.01359221009092                   38              112
##                                             
##                Accuracy : 0.7433            
##                  95% CI : (0.69, 0.7918)    
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.4867            
##                                             
##  Mcnemar's Test P-Value : 1                 
##                                             
##             Sensitivity : 0.7450            
##             Specificity : 0.7417            
##          Pos Pred Value : 0.7400            
##          Neg Pred Value : 0.7467            
##              Prevalence : 0.4967            
##          Detection Rate : 0.3700            
##    Detection Prevalence : 0.5000            
##       Balanced Accuracy : 0.7433            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.741721854304636"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                "Specificity: 0.74496644295302"
## [3] "Specificity: 0"               
## [1] "AUC: 0.743344148628828"
## [1] "Error rate:0.256666666666667"
## [1] "False positives percentage: 25.503%"
## [1] 25.50336

NB w/ PCA On First Nine Components

# NB on the first 9 principal components
german.pca.first.nine <- Get.First.n.PCs(german.pca, 9)
NB.Model.PCA(german.pca.first.nine, dummy.german_std)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                116               34
##   1.01359221009092                   33              117
##                                             
##                Accuracy : 0.7767            
##                  95% CI : (0.7253, 0.8225)  
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.5533            
##                                             
##  Mcnemar's Test P-Value : 1                 
##                                             
##             Sensitivity : 0.7785            
##             Specificity : 0.7748            
##          Pos Pred Value : 0.7733            
##          Neg Pred Value : 0.7800            
##              Prevalence : 0.4967            
##          Detection Rate : 0.3867            
##    Detection Prevalence : 0.5000            
##       Balanced Accuracy : 0.7767            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.774834437086093"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.778523489932886"
## [3] "Specificity: 0"                
## [1] "AUC: 0.776678963509489"
## [1] "Error rate:0.223333333333333"
## [1] "False positives percentage: 22.148%"
## [1] 22.14765

NB w/ PCA On First Twenty Components

# NB on the first 20 principal components
german.pca.first.twenty <- Get.First.n.PCs(german.pca, 20)
NB.Model.PCA(german.pca.first.twenty, dummy.german_std)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                120               24
##   1.01359221009092                   29              127
##                                             
##                Accuracy : 0.8233            
##                  95% CI : (0.7754, 0.8648)  
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.6466            
##                                             
##  Mcnemar's Test P-Value : 0.5827            
##                                             
##             Sensitivity : 0.8054            
##             Specificity : 0.8411            
##          Pos Pred Value : 0.8333            
##          Neg Pred Value : 0.8141            
##              Prevalence : 0.4967            
##          Detection Rate : 0.4000            
##    Detection Prevalence : 0.4800            
##       Balanced Accuracy : 0.8232            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.841059602649007"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.805369127516778"
## [3] "Specificity: 0"                
## [1] "AUC: 0.823214365082892"
## [1] "Error rate:0.176666666666667"
## [1] "False positives percentage: 19.463%"
## [1] 19.46309
# NB on the 10 RFE-selected features from the balanced data
NB.Model(RFE.10.Balanced.Dummy.German)

## NULL
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 149   1
##          1   0 150
##                                           
##                Accuracy : 0.9967          
##                  95% CI : (0.9816, 0.9999)
##     No Information Rate : 0.5033          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9933          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9934          
##          Pos Pred Value : 0.9933          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.4967          
##          Detection Rate : 0.4967          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9967          
##                                           
##        'Positive' Class : 0               
##                                           
## [1] "Sensitivity: 0"                 "Sensitivity: 0.993377483443709"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1" "Specificity: 1" "Specificity: 0"
## [1] "AUC: 0.996688741721854"
## [1] "Error rate:0.00333333333333333"
## [1] "False positives percentage: 0%"
## [1] 0

Passing PCA Results in KNN

# Train and evaluate a KNN classifier (k tuned over 1..30 by 10-fold
# CV) on PCA scores.
#   pca.data - a prcomp object (possibly truncated)
#   data     - matrix/data frame with a "Default" column
# Prints the tuning curve, ROC curve, confusion matrix and metrics;
# returns the false-positive percentage.
KNN.Model.PCA <- function(pca.data, data) {

  # Project the data onto the principal components
  pc_scores <- predict(pca.data, data)

  # 70/30 train/test split on shared row indices
  set.seed(123) # for reproducibility
  train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
  train_data <- pc_scores[train_index, ]
  # NOTE(review): if `data` is a scaled matrix these labels are scaled
  # Default values, not 0/1 — consider passing raw labels instead
  train_label <- as.factor(data[train_index,"Default"])
  test_data <- pc_scores[-train_index, ]
  test_label <- as.factor(data[-train_index,"Default"])

  # Tune k over 1..30 with 10-fold cross-validation
  knn_model <- train(
    x = train_data,
    y = train_label,
    method = "knn",
    trControl = trainControl(method = "cv", number = 10),
    tuneGrid = data.frame(k = 1:30)
  )

  # Evaluate the model on the held-out set
  knnPred <- predict(knn_model, newdata=test_data)
  knnConf <- confusionMatrix(knnPred, test_label)

  # k with the best cross-validated accuracy
  kOpt <- knn_model$bestTune$k

  # Plot accuracy against k, marking the chosen k
  gg <- ggplot(knn_model$results, aes(x=k, y=Accuracy)) +
    geom_line() +
    geom_point(size = 3) +
    geom_vline(xintercept=kOpt, color="red", linetype="dashed") +
    labs(title="Tuning Parameter Performance",
         x="K",
         y="Accuracy") +
    theme_minimal()

  print(gg)

  # ROC/AUC via ROCR; local name fixed from typo "knnPredictiction"
  knnPrediction <- prediction(as.numeric(knnPred), as.numeric(test_label))
  knnPerf <- performance(knnPrediction, measure = "tpr", x.measure = "fpr")
  knnAUC <- performance(knnPrediction, measure = "auc")

  # plot() returns NULL; the original print(plot(...)) emitted a stray
  # "NULL" line in the report
  plot(knnPerf)

  # Extract performance metrics (tpr/fpr vectors over ROCR's cutoffs)
  sensitivity <- slot(knnPerf, "y.values")[[1]]
  specificity <- 1 - slot(knnPerf, "x.values")[[1]]
  auc <- slot(knnAUC, "y.values")
  knnError <- mean(as.numeric(knnPred) !=as.numeric(test_label))

  # Print performance metrics
  print(knnConf)
  print(paste0("Sensitivity: ", sensitivity))
  print(paste0("Specificity: ", specificity))
  print(paste0("AUC: ", auc))
  print(paste0("Optimal K:", kOpt))
  print(paste0("Error rate:", knnError))

  # False positives: predicted factor level 2 while the truth is level 1
  false_positives <- sum(as.numeric(knnPred) == 2 & as.numeric(test_label) == 1)

  # Express as a percentage of the true level-1 cases
  total_negatives <- sum(as.numeric(test_label) == 1)
  false_positives_percent <- false_positives / total_negatives * 100

  # Print the false positives percentage
  print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))

  return(false_positives_percent)
}
# Train and evaluate a KNN classifier on raw (non-PCA) data. The data
# are one-hot encoded and standardized inside the function.
#   data - data frame with a "Default" label column
# Prints the tuning curve, ROC curve, confusion matrix and metrics;
# returns the false-positive percentage.
KNN.Model <- function(data) {

  # One-hot encode every column (keeps all levels)
  dummy <- dummyVars(" ~ .", data=data)
  dummy.data <- data.frame(predict(dummy, newdata = data))

  # BUG FIX: the original scaled the Default column along with the
  # predictors and then (a) built the class factor from those scaled
  # values — which is why the confusion-matrix levels printed as
  # numbers like "-0.654326..." — and (b) left Default in the feature
  # matrix, leaking the label into the classifier. Predictors are now
  # scaled without Default, and labels come from the unscaled data.
  feature_cols <- setdiff(colnames(dummy.data), "Default")
  dummy.data_std <- scale(dummy.data[, feature_cols])

  # 70/30 train/test split
  set.seed(123) # for reproducibility
  train_index <- sample(nrow(dummy.data_std), nrow(dummy.data_std) * 0.7) # 70% for training
  train_data <- dummy.data_std[train_index, ]
  train_label <- as.factor(dummy.data[train_index,"Default"])
  test_data <- dummy.data_std[-train_index, ]
  test_label <- as.factor(dummy.data[-train_index,"Default"])

  # Tune k over 1..30 with 10-fold cross-validation
  knn_model <- train(
    x = train_data,
    y = train_label,
    method = "knn",
    trControl = trainControl(method = "cv", number = 10),
    tuneGrid = data.frame(k = 1:30)
  )

  # Evaluate the model on the held-out set
  knnPred <- predict(knn_model, newdata=test_data)
  knnConf <- confusionMatrix(knnPred, test_label)

  # k with the best cross-validated accuracy
  kOpt <- knn_model$bestTune$k

  # Plot accuracy against k, marking the chosen k
  gg <- ggplot(knn_model$results, aes(x=k, y=Accuracy)) +
    geom_line() +
    geom_point(size = 3) +
    geom_vline(xintercept=kOpt, color="red", linetype="dashed") +
    labs(title="Tuning Parameter Performance",
         x="K",
         y="Accuracy") +
    theme_minimal()

  print(gg)

  # ROC/AUC via ROCR; local name fixed from typo "knnPredictiction"
  knnPrediction <- prediction(as.numeric(knnPred), as.numeric(test_label))
  knnPerf <- performance(knnPrediction, measure = "tpr", x.measure = "fpr")
  knnAUC <- performance(knnPrediction, measure = "auc")

  # plot() returns NULL; the original print(plot(...)) emitted a stray
  # "NULL" line in the report
  plot(knnPerf)

  # Extract performance metrics (tpr/fpr vectors over ROCR's cutoffs)
  sensitivity <- slot(knnPerf, "y.values")[[1]]
  specificity <- 1 - slot(knnPerf, "x.values")[[1]]
  auc <- slot(knnAUC, "y.values")
  knnError <- mean(as.numeric(knnPred) !=as.numeric(test_label))

  # Print performance metrics
  print(knnConf)
  print(paste0("Sensitivity: ", sensitivity))
  print(paste0("Specificity: ", specificity))
  print(paste0("AUC: ", auc))
  print(paste0("Optimal K:", kOpt))
  print(paste0("Error rate:", knnError))

  # False positives: predicted factor level 2 while the truth is level 1
  false_positives <- sum(as.numeric(knnPred) == 2 & as.numeric(test_label) == 1)

  # Express as a percentage of the true level-1 cases
  total_negatives <- sum(as.numeric(test_label) == 1)
  false_positives_percent <- false_positives / total_negatives * 100

  # Print the false positives percentage
  print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))

  return(false_positives_percent)
}
# KNN on all PCs. NOTE(review): dummy.german_std was reassigned to the
# scaled *balanced* data above, while german.pca was fit on the
# imbalanced data — verify this pairing is intended.
KNN.Model.PCA(german.pca, dummy.german_std)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                139               24
##   1.01359221009092                   10              127
##                                             
##                Accuracy : 0.8867            
##                  95% CI : (0.8452, 0.9202)  
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : < 2e-16           
##                                             
##                   Kappa : 0.7735            
##                                             
##  Mcnemar's Test P-Value : 0.02578           
##                                             
##             Sensitivity : 0.9329            
##             Specificity : 0.8411            
##          Pos Pred Value : 0.8528            
##          Neg Pred Value : 0.9270            
##              Prevalence : 0.4967            
##          Detection Rate : 0.4633            
##    Detection Prevalence : 0.5433            
##       Balanced Accuracy : 0.8870            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.841059602649007"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.932885906040268"
## [3] "Specificity: 0"                
## [1] "AUC: 0.886972754344638"
## [1] "Optimal K:24"
## [1] "Error rate:0.113333333333333"
## [1] "False positives percentage: 6.711%"
## [1] 6.711409
# Evaluate KNN on the 4-feature RFE-selected dataset (presumably the same
# pipeline as the PCA variant — TODO confirm against KNN.Model's definition).
# Auto-prints the confusion matrix, metrics, and the returned
# false-positive percentage (1.96%); optimal K here is 2.
KNN.Model(RFE.4.German)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.654326261999973 1.52676127799994
##   -0.654326261999973                200                6
##   1.52676127799994                    4               90
##                                             
##                Accuracy : 0.9667            
##                  95% CI : (0.9396, 0.9839)  
##     No Information Rate : 0.68              
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.923             
##                                             
##  Mcnemar's Test P-Value : 0.7518            
##                                             
##             Sensitivity : 0.9804            
##             Specificity : 0.9375            
##          Pos Pred Value : 0.9709            
##          Neg Pred Value : 0.9574            
##              Prevalence : 0.6800            
##          Detection Rate : 0.6667            
##    Detection Prevalence : 0.6867            
##       Balanced Accuracy : 0.9589            
##                                             
##        'Positive' Class : -0.654326261999973
##                                             
## [1] "Sensitivity: 0"      "Sensitivity: 0.9375" "Sensitivity: 1"     
## [1] "Specificity: 1"                 "Specificity: 0.980392156862745"
## [3] "Specificity: 0"                
## [1] "AUC: 0.958946078431372"
## [1] "Optimal K:2"
## [1] "Error rate:0.0333333333333333"
## [1] "False positives percentage: 1.961%"
## [1] 1.960784
# Evaluate KNN on the 10-feature RFE-selected dataset. Accuracy drops to
# 0.8333 vs 0.9667 for the 4-feature subset (see transcripts), suggesting
# the extra features add noise for KNN. Returns the false-positive
# percentage (5.88%); optimal K here is 10.
KNN.Model(RFE.10.German)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.654326261999973 1.52676127799994
##   -0.654326261999973                192               38
##   1.52676127799994                   12               58
##                                             
##                Accuracy : 0.8333            
##                  95% CI : (0.7862, 0.8737)  
##     No Information Rate : 0.68              
##     P-Value [Acc > NIR] : 1.293e-09         
##                                             
##                   Kappa : 0.5875            
##                                             
##  Mcnemar's Test P-Value : 0.000407          
##                                             
##             Sensitivity : 0.9412            
##             Specificity : 0.6042            
##          Pos Pred Value : 0.8348            
##          Neg Pred Value : 0.8286            
##              Prevalence : 0.6800            
##          Detection Rate : 0.6400            
##    Detection Prevalence : 0.7667            
##       Balanced Accuracy : 0.7727            
##                                             
##        'Positive' Class : -0.654326261999973
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.604166666666667"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.941176470588235"
## [3] "Specificity: 0"                
## [1] "AUC: 0.772671568627451"
## [1] "Optimal K:10"
## [1] "Error rate:0.166666666666667"
## [1] "False positives percentage: 5.882%"
## [1] 5.882353
# Evaluate KNN on the 10-feature RFE subset of the class-balanced
# (presumably SMOTE-style — TODO confirm upstream) dummy-encoded data.
# Best result of the four runs: accuracy 0.9833, false positives 1.34%,
# optimal K = 8 (see transcript below).
KNN.Model(RFE.10.Balanced.Dummy.German)

## NULL
## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.985603470561783 1.01359221009092
##   -0.985603470561783                147                3
##   1.01359221009092                    2              148
##                                             
##                Accuracy : 0.9833            
##                  95% CI : (0.9615, 0.9946)  
##     No Information Rate : 0.5033            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.9667            
##                                             
##  Mcnemar's Test P-Value : 1                 
##                                             
##             Sensitivity : 0.9866            
##             Specificity : 0.9801            
##          Pos Pred Value : 0.9800            
##          Neg Pred Value : 0.9867            
##              Prevalence : 0.4967            
##          Detection Rate : 0.4900            
##    Detection Prevalence : 0.5000            
##       Balanced Accuracy : 0.9834            
##                                             
##        'Positive' Class : -0.985603470561783
##                                             
## [1] "Sensitivity: 0"                 "Sensitivity: 0.980132450331126"
## [3] "Sensitivity: 1"                
## [1] "Specificity: 1"                 "Specificity: 0.986577181208054"
## [3] "Specificity: 0"                
## [1] "AUC: 0.98335481576959"
## [1] "Optimal K:8"
## [1] "Error rate:0.0166666666666667"
## [1] "False positives percentage: 1.342%"
## [1] 1.342282